﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using PorterStemmerAlgorithm;
using MachineLearning;
using HLDA;

namespace Amazon
{
    /// <summary>
    /// A book identified by a product id, together with the reviews attached to it.
    /// </summary>
    public class Book
    {
        /// <summary>Product identifier of the book.</summary>
        public string id { get; set; }

        /// <summary>Publisher name as supplied by the caller.</summary>
        public string publisher { get; set; }

        /// <summary>Title of the book; stays null when the two-argument constructor is used.</summary>
        public string title { get; set; }

        /// <summary>Reviews attached to this book through <see cref="AddReview"/>.</summary>
        public List<Review> reviews { get; set; }

        /// <summary>
        /// Creates a book without a title and with an empty review list.
        /// </summary>
        /// <param name="id">Product identifier.</param>
        /// <param name="publisher">Publisher name.</param>
        public Book(string id, string publisher)
            : this(id, publisher, null)
        {
        }

        /// <summary>
        /// Creates a book with a title and an empty review list.
        /// </summary>
        /// <param name="id">Product identifier.</param>
        /// <param name="publisher">Publisher name.</param>
        /// <param name="title">Book title.</param>
        public Book(string id, string publisher, string title)
        {
            this.reviews = new List<Review>();
            this.id = id;
            this.publisher = publisher;
            this.title = title;
        }

        /// <summary>
        /// Appends a review to this book and points the review back at this book.
        /// </summary>
        /// <param name="rev">Review to attach.</param>
        public void AddReview(Review rev)
        {
            this.reviews.Add(rev);
            rev.book = this;
        }
    }

    /// <summary>
    /// A single review: its metadata, its text and a bag-of-words count keyed by word index.
    /// </summary>
    public class Review
    {
        /// <summary>Book this review is attached to; null until the review is added to a book.</summary>
        public Book book { get; set; }
        public string authorId { get; set; }
        public DateTime date { get; set; }
        public int helpfulVotes { get; set; }
        public int totalVotes { get; set; }
        public string summary { get; set; }
        public string reviewText { get; set; }
        public double rating { get; set; }

        /// <summary>Number of occurrences of each word, keyed by word index.</summary>
        public Dictionary<int, int> wordCount;

        /// <summary>HLDA document associated with this review (set by callers; may be null).</summary>
        public HldaDoc doc { get; set; }

        /// <summary>
        /// Creates an empty review with an empty word-count table.
        /// </summary>
        public Review()
        {
            this.wordCount = new Dictionary<int, int>();
        }

        /// <summary>
        /// Creates a review from its metadata and text.
        /// </summary>
        /// <param name="authorId">Identifier of the review author.</param>
        /// <param name="date">
        /// Review date as text. When it is null or cannot be parsed, the current local
        /// time is stored instead.
        /// </param>
        /// <param name="helpfulVotes">Number of helpful votes.</param>
        /// <param name="totalVotes">Total number of votes.</param>
        /// <param name="rating">Rating given by the author.</param>
        /// <param name="summary">Short summary line.</param>
        /// <param name="reviewText">Full review text.</param>
        public Review(string authorId, string date, int helpfulVotes, int totalVotes, double rating, string summary, string reviewText)
            : this() // guarantees wordCount is initialised so AddWord works on reviews built this way
        {
            this.authorId = authorId;

            // TryParse keeps the original "fall back to now" behaviour without using
            // exceptions for control flow, and also covers a null date string.
            DateTime parsed;
            this.date = DateTime.TryParse(date, out parsed) ? parsed : DateTime.Now;

            this.helpfulVotes = helpfulVotes;
            this.totalVotes = totalVotes;
            this.rating = rating;
            this.summary = summary;
            this.reviewText = reviewText;
        }

        /// <summary>
        /// Increments the occurrence count of the given word index, starting at 1 for a new word.
        /// </summary>
        /// <param name="wordIndex">Index of the word in the vocabulary.</param>
        public void AddWord(int wordIndex)
        {
            int current;
            // current is left at 0 when the word has not been seen yet.
            wordCount.TryGetValue(wordIndex, out current);
            wordCount[wordIndex] = current + 1;
        }

        /// <summary>
        /// Formats the review as one tab-separated line:
        /// author id, book id, helpful votes, total votes, rating, summary, text.
        /// </summary>
        public override string ToString()
        {
            return string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}", authorId, BookId, helpfulVotes, totalVotes, rating, summary, reviewText);
        }

        /// <summary>
        /// Identifier of the attached book, or null when the review is not attached to any book.
        /// </summary>
        public string BookId
        {
            get { return book == null ? null : book.id; }
        }
    }

    public class Amazon
    {
        public Dictionary<string, Book> books { get; set; }
        public Dictionary<string, int> vocabulary { get; set; }
        public List<Review> reviews { get; set; }

        /// <summary>
        /// Creates an empty data set: no books, no reviews and an empty vocabulary.
        /// </summary>
        public Amazon()
        {
            this.reviews = new List<Review>();
            this.vocabulary = new Dictionary<string, int>();
            this.books = new Dictionary<string, Book>();
        }

        /// <summary>
        /// Returns the vocabulary index of a word, registering the word if it is new.
        /// </summary>
        /// <param name="word">Word to look up; must not be null.</param>
        /// <returns>
        /// The existing index of <paramref name="word"/>, or, for a new word, the
        /// vocabulary size before insertion (which becomes the word's index).
        /// </returns>
        public int GetWordIndex(string word)
        {
            int index;
            // Single lookup instead of ContainsKey followed by the indexer.
            if (!vocabulary.TryGetValue(word, out index))
            {
                index = vocabulary.Count;
                vocabulary[word] = index;
            }
            return index;
        }

        /// <summary>
        /// Registers a review that already references its book: the review is attached to
        /// the registered book with the same id and appended to the flat review list.
        /// </summary>
        /// <param name="rev">Review whose <c>book</c> property is already set.</param>
        /// <exception cref="ArgumentNullException"><paramref name="rev"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The review has no book assigned.</exception>
        /// <exception cref="KeyNotFoundException">No book with the review's book id is registered.</exception>
        public void AddReview(Review rev)
        {
            if (rev == null)
            {
                throw new ArgumentNullException("rev");
            }
            if (rev.book == null)
            {
                throw new InvalidOperationException(
                    "The review is not attached to a book; use AddReview(bookId, rev) instead.");
            }

            // Resolve the registered book before touching any collection so that a failed
            // lookup does not leave the review in the flat list without an owning book.
            Book owner = books[rev.book.id];
            owner.AddReview(rev);
            reviews.Add(rev);
        }

        /// <summary>
        /// Attaches a review to the registered book with the given id and records it
        /// in the flat review list.
        /// </summary>
        /// <param name="bookId">Id of a book already present in <c>books</c>.</param>
        /// <param name="rev">Review to attach.</param>
        public void AddReview(string bookId, Review rev)
        {
            Book owner = books[bookId];
            owner.AddReview(rev);
            reviews.Add(rev);
        }

        /// <summary>
        /// Registers a book under its id. A book whose id is already registered is ignored,
        /// so the first book added for an id wins.
        /// </summary>
        /// <param name="book">Book to register.</param>
        public void AddBook(Book book)
        {
            if (books.ContainsKey(book.id))
            {
                return;
            }

            books.Add(book.id, book);
        }
        
        /// <summary>
        /// Writes every review of every registered book to a tab-separated file,
        /// one review per line after a header line. The file name is echoed to the console.
        /// </summary>
        /// <param name="filename">Path of the file to create or overwrite.</param>
        public void WriteReviewsToFile(string filename)
        {
            Console.WriteLine(filename);

            // "using" closes the file even when writing a review throws.
            using (StreamWriter sw = new StreamWriter(filename))
            {
                sw.WriteLine("AuthorId\tBookId\tHelpful Votes\tTotal Votes\tRating\tSummary\tText");
                foreach (Book book in books.Values)
                {
                    foreach (Review rev in book.reviews)
                    {
                        // Relies on Review.ToString() for the tab-separated layout.
                        sw.WriteLine(rev);
                    }
                }
            }
        }

        /// <summary>
        /// Writes the registered books to a tab-separated file: a header line followed by
        /// one line per book containing id, publisher and title.
        /// </summary>
        /// <param name="filename">Path of the file to create or overwrite.</param>
        public void WriteToFile(string filename)
        {
            // "using" closes the file even when a write throws.
            using (StreamWriter sw = new StreamWriter(filename))
            {
                // Header now names all three columns that are written per book
                // (it previously omitted the title column).
                sw.WriteLine("ProductID\tPublisher\tTitle");
                foreach (Book book in books.Values)
                {
                    sw.WriteLine("{0}\t{1}\t{2}", book.id, book.publisher, book.title);
                }
            }
        }

        /// <summary>
        /// Tells whether a book with the given id has been registered.
        /// </summary>
        /// <param name="key">Book id to look for.</param>
        /// <returns>true when the id is present in <c>books</c>; otherwise false.</returns>
        public bool HasBook(string key)
        {
            Book ignored;
            return books.TryGetValue(key, out ignored);
        }

        /// <summary>
        /// Sets the title of an already registered book.
        /// </summary>
        /// <param name="key">Id of a book present in <c>books</c>.</param>
        /// <param name="title">Title to store on the book.</param>
        public void AddTitle(string key, string title)
        {
            Book target = books[key];
            target.title = title;
        }

        /// <summary>
        /// Scans the tab-separated BooksInfo file, registers every book whose third column
        /// equals <paramref name="publisher"/>, and writes the registered books to
        /// "&lt;publisher&gt; Books.txt" (with '/' in the publisher name replaced by '_').
        /// </summary>
        /// <param name="publisher">Exact publisher name to match.</param>
        public void GetPublisherBooks(string publisher)
        {
            string file = @"Z:\Desktop\Amazon Reviews\Amazon Data\BooksInfo.txt";

            // "using" releases the file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                // The first line is read and discarded (presumably a header row).
                sr.ReadLine();

                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 3)
                    {
                        // Blank or truncated line: no publisher column to compare against.
                        continue;
                    }

                    if (tokens[2].Equals(publisher))
                    {
                        Console.WriteLine("BookId:{0}\tPress:{1}", tokens[1], tokens[2]);
                        AddBook(new Book(tokens[1], tokens[2]));
                    }
                }
            }

            // '/' cannot appear in a file name, so it is replaced before building the path.
            string safeName = publisher.Replace('/', '_');
            Console.WriteLine(safeName);
            WriteToFile(string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0} Books.txt", safeName));
        }

        /// <summary>
        /// Loads the books listed in "&lt;publisher&gt; Books.txt", looks up their titles in
        /// productinfo.txt, and rewrites the books file with the titles filled in.
        /// </summary>
        /// <remarks>
        /// productinfo.txt is read as a sequence of records: a marker line, then a
        /// tab-separated product line (id, title, ...), then any number of lines that are
        /// skipped until the next "BREAK-REVIEWED" marker.
        /// </remarks>
        /// <param name="publisher">Publisher name; '/' is replaced by '_' to form the file name.</param>
        public void GetPublisherBookTitles(string publisher)
        {
            publisher = publisher.Replace('/', '_');
            string booksFile = string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0} Books.txt", publisher);
            string line;

            using (StreamReader sr = new StreamReader(booksFile))
            {
                sr.ReadLine(); //read header
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 2)
                    {
                        // Blank or truncated line: id/publisher columns are missing.
                        continue;
                    }
                    AddBook(new Book(tokens[0].Trim(), tokens[1].Trim()));
                }
            }

            using (StreamReader sr = new StreamReader(@"Z:\Desktop\Amazon Reviews\Amazon Data\productinfo.txt"))
            {
                line = sr.ReadLine(); //read the BREAK

                while (line != null)
                {
                    line = sr.ReadLine();
                    if (line == null) break;

                    string[] tokens = line.Split('\t');
                    string productId = tokens[0].Trim();
                    // Require a title column before using it.
                    if (tokens.Length > 1 && HasBook(productId))
                    {
                        Console.WriteLine("!{0}!", tokens[1]);
                        AddTitle(productId, tokens[1].Trim());
                    }

                    // Skip the rest of the record. The null check stops cleanly when the
                    // file ends without a final marker instead of dereferencing null.
                    do
                    {
                        line = sr.ReadLine();
                    } while (line != null && !line.Equals("BREAK-REVIEWED"));
                }
            }

            // The books file reader is closed by now, so the same file can be rewritten.
            WriteToFile(string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0} Books.txt", publisher));
        }

        /// <summary>
        /// Reads a tab-separated review file, turns the last two columns of each line
        /// (summary and text) into stemmed word counts, and writes the vocabulary and
        /// the per-review counts to Vocabulary.txt and Amazon.dat.
        /// </summary>
        /// <param name="reviewFile">File name (without ".txt") inside the Amazon Data folder.</param>
        public void ConvertToHLDAFormat(string reviewFile)
        {
            // Anything that is not an ASCII letter acts as a separator, which drops
            // punctuation and digits.
            Regex regex = new Regex("[^a-zA-Z]");
            PorterStemmer porterStemmer = new PorterStemmer();
            string file = string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0}.txt", reviewFile);
            Console.WriteLine(file);

            // "using" releases the file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                sr.ReadLine(); //read headers

                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 2)
                    {
                        // Blank or truncated line: there is no summary/text pair to read.
                        continue;
                    }

                    //text is in last two
                    string summaryText = tokens[tokens.Length - 2];
                    string bodyText = tokens[tokens.Length - 1];

                    //now remove punctuations and numbers
                    string longString = string.Format("{0} {1}", summaryText, bodyText);
                    Review rev = new Review();
                    foreach (string word in regex.Split(longString))
                    {
                        if (word.Equals(""))
                        {
                            continue;
                        }
                        string stemmed = porterStemmer.stemTerm(word.ToLower());
                        rev.AddWord(GetWordIndex(stemmed));
                    }

                    // These reviews are never attached to a Book, so they go straight into
                    // the flat list. AddReview(rev) would dereference the missing book and throw.
                    reviews.Add(rev);
                }
            }

            WriteVocabularyToFile(@"Z:\Desktop\Amazon Reviews\Amazon Data\Vocabulary.txt");
            WriteReviewsToHLDA(@"Z:\Desktop\Amazon Reviews\Amazon Data\Amazon.dat");
        }

        /// <summary>
        /// Loads books (id, title) from a tab-separated file, collects their reviews from
        /// reviewsNew1.txt to reviewsNew3.txt, and writes the collected reviews next to the
        /// input file as "&lt;name&gt; Reviews.txt".
        /// </summary>
        /// <remarks>
        /// Review lines are expected to hold at least eight tab-separated fields:
        /// author id, book id, date, helpful votes, total votes, rating, summary, text.
        /// Shorter lines and lines for unregistered books are skipped.
        /// </remarks>
        /// <param name="file">Path of the book list; every line is treated as a book (no header is skipped).</param>
        public void GetBookReviews(string file)
        {
            string line;

            // "using" releases each file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 2)
                    {
                        // Blank or truncated line: id/title columns are missing.
                        continue;
                    }
                    AddBook(new Book(tokens[0].Trim(), "", tokens[1].Trim()));
                }
            }

            for (int i = 1; i < 4; i++)
            {
                string reviewfile = string.Format(@"D:\amazondata\reviewsNew{0}.txt", i);
                Console.WriteLine(reviewfile);
                using (StreamReader sr = new StreamReader(reviewfile))
                {
                    while ((line = sr.ReadLine()) != null)
                    {
                        string[] tokens = line.Split('\t');
                        if (tokens.Length < 8 || !HasBook(tokens[1]))
                        {
                            continue;
                        }

                        Review rev = new Review(tokens[0].Trim(), tokens[2].Trim(),
                            int.Parse(tokens[3]), int.Parse(tokens[4]),
                            double.Parse(tokens[5]), tokens[6].Trim(), tokens[7].Trim());
                        AddReview(tokens[1], rev);
                    }
                }
            }

            WriteReviewsToFile(file.Replace(".txt", " Reviews.txt"));
        }

        /// <summary>
        /// Loads books (id, publisher, title) from "&lt;publisher&gt;.txt", collects their
        /// reviews from reviewsNew1.txt to reviewsNew3.txt, and writes the collected
        /// reviews to "&lt;publisher&gt; Reviews.txt".
        /// </summary>
        /// <remarks>
        /// Review lines are expected to hold at least eight tab-separated fields:
        /// author id, book id, date, helpful votes, total votes, rating, summary, text.
        /// Shorter lines and lines for unregistered books are skipped.
        /// </remarks>
        /// <param name="publisher">Base name of the book list file inside the Amazon Data folder.</param>
        public void GetPublisherBookReviews(string publisher)
        {
            string file = string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0}.txt", publisher);
            string line;

            // "using" releases each file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                sr.ReadLine(); //read header
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 3)
                    {
                        // Blank or truncated line: id/publisher/title columns are missing.
                        continue;
                    }
                    AddBook(new Book(tokens[0].Trim(), tokens[1].Trim(), tokens[2].Trim()));
                }
            }

            for (int i = 1; i < 4; i++)
            {
                file = string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\reviewsNew{0}.txt", i);
                Console.WriteLine(file);
                using (StreamReader sr = new StreamReader(file))
                {
                    while ((line = sr.ReadLine()) != null)
                    {
                        string[] tokens = line.Split('\t');
                        if (tokens.Length < 8 || !HasBook(tokens[1]))
                        {
                            continue;
                        }

                        Review rev = new Review(tokens[0].Trim(), tokens[2].Trim(),
                            int.Parse(tokens[3]), int.Parse(tokens[4]),
                            double.Parse(tokens[5]), tokens[6].Trim(), tokens[7].Trim());

                        AddReview(tokens[1], rev);
                    }
                }
            }

            WriteReviewsToFile(string.Format(@"Z:\Desktop\Amazon Reviews\Amazon Data\{0} Reviews.txt", publisher));
        }

        /// <summary>
        /// Program entry point. Only the review-ranking step is active; the other steps
        /// are kept below as disabled calls.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Amazon corpus = new Amazon();

            // Disabled steps, listed in the order they appear in the original regions:
            //   Get Books of Publisher:
            //     corpus.GetPublisherBooks("McGraw-Hill Science/Engineering/Math");
            //   Get Titles of Publisher Books:
            //     corpus.GetPublisherBookTitles("McGraw-Hill Science/Engineering/Math");
            //   Get Reviews of Publisher Books:
            //     corpus.GetBookReviews(@"..\..\..\..\Books.txt");
            //   Convert Reviews into HLDA format:
            //     corpus.ConvertToHLDAFormat("Books Reviews");
            //   Convert Reviews into Separate Files:
            //     corpus.ConvertToSeparateFiles("Books Reviews");

            // Rank Reviews
            string rankedReviewsPath = @"..\..\..\..\RankedReviews2.txt";
            corpus.RankReviews(rankedReviewsPath);
        }

        /// <summary>
        /// Parses a ranked-review file, registers the books and reviews it contains, and
        /// prints every review of every registered book to the console as one
        /// tab-separated row (votes, topic probabilities, summary, text).
        /// </summary>
        /// <remarks>
        /// The layout read here mirrors what WriteRankedReviews emits: per book a column
        /// header line, a "id / title / review count" line and a blank line; per review
        /// the author, votes ("helpful/total"), rating, topic path, topic probabilities,
        /// summary, text and a blank line.
        /// </remarks>
        /// <param name="file">Path of the ranked-review file.</param>
        /// <exception cref="EndOfStreamException">The file ends in the middle of a review record.</exception>
        public void RankReviews(string file)
        {
            // "using" releases the file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                while (!sr.EndOfStream)
                {
                    sr.ReadLine(); //read book header
                    string line = sr.ReadLine();
                    if (line == null)
                    {
                        // Only a dangling line (e.g. a trailing blank) was left; nothing more to parse.
                        break;
                    }

                    string[] tokens = line.Split('\t');
                    Book b = new Book(tokens[0], "", tokens[1]);
                    int numReviews = int.Parse(tokens[2]);
                    sr.ReadLine(); //read the space

                    for (int i = 0; i < numReviews; i++)
                    {
                        Review rev = new Review();

                        tokens = ReadRankedLine(sr, "author id").Split('\t');
                        rev.authorId = tokens[1];

                        // Last column holds "helpful/total".
                        tokens = ReadRankedLine(sr, "vote counts").Split('\t').Last().Split('/');
                        rev.helpfulVotes = int.Parse(tokens[0]);
                        rev.totalVotes = int.Parse(tokens[1]);

                        tokens = ReadRankedLine(sr, "rating").Split('\t');
                        rev.rating = double.Parse(tokens[1]);

                        ReadRankedLine(sr, "topic path"); // path names are not needed here

                        // First column is the row label; the rest are the probabilities.
                        tokens = ReadRankedLine(sr, "topic probabilities").Split('\t');
                        HldaDoc doc = new HldaDoc("", tokens.Length - 1);
                        doc.topicProb = new double[tokens.Length - 1];
                        for (int l = 1; l < tokens.Length; l++)
                        {
                            doc.topicProb[l - 1] = double.Parse(tokens[l]);
                        }
                        rev.doc = doc;

                        rev.summary = sr.ReadLine();
                        rev.reviewText = sr.ReadLine();

                        b.AddReview(rev);
                        sr.ReadLine(); // blank line separating reviews
                    }
                    AddBook(b);
                }
            }

            //now sort the reviews among each book
            //just print to a tab delimited file

            // The topic column count is taken from the first available review. With no
            // reviews at all the header simply has no topic columns (previously this threw).
            int topicCount = 0;
            foreach (Book b in books.Values)
            {
                if (b.reviews.Count > 0)
                {
                    topicCount = b.reviews[0].doc.topicProb.Length;
                    break;
                }
            }

            Console.Write("Helpful Votes\tTotal Votes\t");
            for (int i = 0; i < topicCount; i++)
            {
                Console.Write("Topic {0}\t", i);
            }
            Console.WriteLine("Summary\tReview");

            foreach (Book b in books.Values)
            {
                foreach (Review r in b.reviews)
                {
                    Console.Write("{0}\t{1}\t", r.helpfulVotes, r.totalVotes);
                    for (int i = 0; i < r.doc.topicProb.Length; i++)
                    {
                        Console.Write("{0}\t", r.doc.topicProb[i]);
                    }
                    Console.WriteLine("{0}\t{1}", r.summary, r.reviewText);
                }
            }
        }

        /// <summary>
        /// Reads one mandatory line of a review record, failing with a descriptive error
        /// instead of a null dereference when the file is truncated.
        /// </summary>
        private static string ReadRankedLine(StreamReader reader, string description)
        {
            string line = reader.ReadLine();
            if (line == null)
            {
                throw new EndOfStreamException(string.Format(
                    "Ranked review file ended unexpectedly while reading the {0} line.", description));
            }
            return line;
        }

        /// <summary>
        /// Reads a tab-separated review file and turns the last two columns of each line
        /// (summary and text) into stemmed word counts, one Review per line, appended to
        /// the flat review list. The vocabulary is extended as new stems are met.
        /// </summary>
        /// <param name="p">File name (without ".txt") relative to "..\..\..\..\".</param>
        private void ConvertToSeparateFiles(string p)
        {
            // Anything that is not an ASCII letter acts as a separator, which drops
            // punctuation and digits.
            Regex regex = new Regex("[^a-zA-Z]");
            PorterStemmer porterStemmer = new PorterStemmer();
            string file = string.Format(@"..\..\..\..\{0}.txt", p);
            Console.WriteLine(file);

            // "using" releases the file handle even when parsing throws.
            using (StreamReader sr = new StreamReader(file))
            {
                sr.ReadLine(); //read headers

                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string[] tokens = line.Split('\t');
                    if (tokens.Length < 2)
                    {
                        // Blank or truncated line: there is no summary/text pair to read.
                        continue;
                    }

                    //text is in last two
                    string summaryText = tokens[tokens.Length - 2];
                    string bodyText = tokens[tokens.Length - 1];

                    //now remove punctuations and numbers
                    string longString = string.Format("{0} {1}", summaryText, bodyText);
                    Review rev = new Review();
                    foreach (string word in regex.Split(longString))
                    {
                        if (word.Equals(""))
                        {
                            continue;
                        }
                        string stemmed = porterStemmer.stemTerm(word.ToLower());
                        rev.AddWord(GetWordIndex(stemmed));
                    }

                    // These reviews are never attached to a Book, so they go straight into
                    // the flat list. AddReview(rev) would dereference the missing book and throw.
                    reviews.Add(rev);
                }
            }
        }

        /// <summary>
        /// Writes each review's word counts to its own file named
        /// "&lt;bookId&gt;_&lt;authorId&gt;.txt" inside the given directory, as a single line of
        /// " index:count" pairs.
        /// </summary>
        /// <param name="p">Directory that receives the per-review files.</param>
        private void WriteReviewsToFiles(string p)
        {
            foreach (Review rev in reviews)
            {
                string path = string.Format(@"{0}\{1}_{2}.txt",
                    p, rev.BookId, rev.authorId);

                // "using" closes each file even when a write throws, so a failure part-way
                // through the list does not leak a handle.
                using (StreamWriter sw = new StreamWriter(path))
                {
                    foreach (KeyValuePair<int, int> kvp in rev.wordCount)
                    {
                        sw.Write(" {0}:{1}", kvp.Key, kvp.Value);
                    }
                    sw.WriteLine();
                }
            }
        }

        /// <summary>
        /// Writes all reviews to one file, one line per review: the number of distinct
        /// words followed by " index:count" for each word.
        /// </summary>
        /// <param name="p">Path of the file to create or overwrite.</param>
        private void WriteReviewsToHLDA(string p)
        {
            // "using" closes the file even when a write throws.
            using (StreamWriter sw = new StreamWriter(p))
            {
                foreach (Review rev in reviews)
                {
                    sw.Write("{0}", rev.wordCount.Count);
                    foreach (KeyValuePair<int, int> kvp in rev.wordCount)
                    {
                        sw.Write(" {0}:{1}", kvp.Key, kvp.Value);
                    }
                    sw.WriteLine();
                }
            }
        }

        /// <summary>
        /// Writes the vocabulary to a file, one word per line in ascending index order,
        /// and echoes "line number, index, word" to the console for each entry.
        /// </summary>
        /// <param name="p">Path of the file to create or overwrite.</param>
        private void WriteVocabularyToFile(string p)
        {
            // "using" closes the file even when a write throws.
            using (StreamWriter sw = new StreamWriter(p))
            {
                int i = 0;
                // Dictionary enumeration order is not guaranteed, so sort by the assigned
                // index to keep each word on the line that matches its index.
                foreach (KeyValuePair<string, int> entry in vocabulary.OrderBy(kv => kv.Value))
                {
                    Console.WriteLine("{0}\t{1}\t{2}", i++, entry.Value, entry.Key);
                    sw.WriteLine(entry.Key);
                }
            }
        }

        /// <summary>
        /// Writes every registered book and its reviews to a text file. Each book gets a
        /// column header line, an "id / title / review count" line and a blank line. Each
        /// review gets author, votes, rating, topic path, topic path probabilities,
        /// summary, text and a blank line.
        /// </summary>
        /// <remarks>
        /// Calls CalculateTopicProbability() on each review's HLDA document before
        /// writing, and emits Global.maxLevel entries of its path and topicProb arrays.
        /// </remarks>
        /// <param name="file">Path of the file to create or overwrite.</param>
        public void WriteRankedReviews(string file)
        {
            // "using" closes the file even when a review without a document (or any
            // other failure) interrupts the export.
            using (StreamWriter sw = new StreamWriter(file))
            {
                foreach (Book book in books.Values)
                {
                    sw.WriteLine("Book ID\tTitle\tNumber of Reviews");
                    sw.WriteLine("{0}\t{1}\t{2}", book.id, book.title, book.reviews.Count);
                    sw.WriteLine();

                    foreach (Review rev in book.reviews)
                    {
                        rev.doc.CalculateTopicProbability();
                        sw.WriteLine("Author ID\t{0}", rev.authorId);
                        sw.WriteLine("Useful Votes\t{0}/{1}", rev.helpfulVotes, rev.totalVotes);
                        sw.WriteLine("Rating\t{0}", rev.rating);

                        sw.Write("Topic Path");
                        for (int i = 0; i < Global.maxLevel; i++)
                        {
                            sw.Write("\t{0}", rev.doc.path[i].DisplayName);
                        }
                        sw.WriteLine();

                        sw.Write("Topic Path Probabilities");
                        for (int i = 0; i < Global.maxLevel; i++)
                        {
                            sw.Write("\t{0}", rev.doc.topicProb[i]);
                        }
                        sw.WriteLine();

                        sw.WriteLine(rev.summary);
                        sw.WriteLine(rev.reviewText);
                        sw.WriteLine();
                    }
                }
            }
        }
    }
}
